import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
sys.path.append(sys.argv[1])

import pandas as pd
import pickle

# Load atp_tennis.csv from the directory passed as the first command-line argument
_csv_path = os.path.join(sys.argv[1], 'atp_tennis.csv')
atp_tennis = pd.read_csv(_csv_path)

# Per-column count of null entries (isna is the same check as isnull)
missing_values = atp_tennis.isna().sum()

print(missing_values)
# pickle.dump(missing_values,open("./ref_result/missing_values.pkl","wb"))

import pandas as pd
import pickle


# Number of distinct values in each of the selected descriptor columns
_profiled_columns = ['Tournament', 'Series', 'Court', 'Surface', 'Round']
unique_values = atp_tennis[_profiled_columns].nunique()

print(unique_values)
# pickle.dump(unique_values,open("./ref_result/unique_values.pkl","wb"))

import pandas as pd
import pickle


# Assemble the data quality report: one row per column name, with the
# missing-value count and the unique-value count side by side. The two Series
# are aligned on their index, so a column that has no entry in unique_values
# shows NaN under 'Unique Values'.
_report_columns = {
    'Missing Values': missing_values,
    'Unique Values': unique_values,
}
data_quality_report = pd.DataFrame(_report_columns)

print(data_quality_report)
# pickle.dump(data_quality_report,open("./ref_result/data_quality_report.pkl","wb"))

import pandas as pd
import pickle


# Count the number of wins for each player
player_wins = atp_tennis['Winner'].value_counts()

# Count the total number of matches played by each player.
# Series.add(..., fill_value=0) is used instead of `+`: `+` aligns on the
# player name and yields NaN for anyone who appears in only one of the two
# columns, which would silently drop that player from every later statistic.
player_matches = atp_tennis['Player_1'].value_counts().add(
    atp_tennis['Player_2'].value_counts(), fill_value=0
)

# Align the win counts with the full player list. Players who never won are
# absent from the Winner counts and must count as 0 wins rather than NaN.
wins_per_player = player_wins.reindex(player_matches.index, fill_value=0)
losses_per_player = player_matches - wins_per_player

# Calculate win/loss ratio for each player.
# NOTE: a player with zero recorded losses gets an infinite ratio (division by
# zero); downstream statistics such as the mean will reflect that.
win_loss_ratios = wins_per_player / losses_per_player

# Add win/loss ratios to the dataset, looked up by player name
atp_tennis['Win_Loss_Ratio_1'] = atp_tennis['Player_1'].map(win_loss_ratios)
atp_tennis['Win_Loss_Ratio_2'] = atp_tennis['Player_2'].map(win_loss_ratios)

# Stack the Player_1 values on top of the Player_2 values so that every
# (rank, ratio) observation becomes one row with a fresh 0..n-1 index
combined_ranks = pd.concat(
    [atp_tennis['Rank_1'], atp_tennis['Rank_2']], ignore_index=True
)
combined_win_loss_ratios = pd.concat(
    [atp_tennis['Win_Loss_Ratio_1'], atp_tennis['Win_Loss_Ratio_2']],
    ignore_index=True,
)

# Create a DataFrame with combined ranks and win/loss ratios
combined_data = pd.DataFrame({'Rank': combined_ranks, 'Win_Loss_Ratio': combined_win_loss_ratios})

print(combined_data)
# pickle.dump(combined_data,open("./ref_result/combined_data.pkl","wb"))

import pandas as pd
import numpy as np
import pickle


# Summarise combined_data with pandas' default describe() output
desc_stats = combined_data.describe()

# Show the summary table on stdout
print(desc_stats)
# pickle.dump(desc_stats,open("./ref_result/desc_stats.pkl","wb"))

import pandas as pd
import matplotlib.pyplot as plt 
import pickle


# Histogram for Player Rankings Distribution
fig1, ax1 = plt.subplots()
ax1.hist(combined_ranks, bins=50, alpha=0.5)
ax1.set_title('Player Rankings Distribution')
ax1.set_xlabel('Ranking')

# savefig does not create missing directories, so make sure the target exists
os.makedirs('./ref_result', exist_ok=True)
# Save through the figure handle rather than pyplot's implicit current figure
fig1.savefig('./ref_result/hist_chart.png')
# plt.show()

